---
title: "Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Appendix A"
output: html_notebook
---

This is the R code used for the data analyses presented in Appendix A of the article "Bureaucratic Responsiveness in Humanitarian Aid".

The data here is used to replicate Tables A1, A2, A5, and A6. The replication data for the topic model, including that used to generate Tables A3 and A4 can be found in the topic model replication files. 


First, load the environment and main data set. 
```{r}
Sys.setenv(LANGUAGE = "en")


library(dplyr)
library(ggplot2)
library(fixest)
library(broom)
library(mice)
library(miceadds)
library(purrr)
library(tidyr)
library(stringr)
library(ggcorrplot)
library(stargazer)

data <- read.csv(file = "maindata_fpa_replication.csv")
```


Log transform all numeric variables. 
```{r}
#first replace 0 values within affected_ofda with NA, as these values are missing data points. 
data$affected_ofda[data$affected_ofda == 0] <- NA

numeric_vars <- sapply(data, is.numeric)

for (var in names(data)[numeric_vars]) {
  temp <- data[[var]]
  
  temp[temp == 0] <- 0.1
  
  if (var == "lag_trade_bal") {
    data[[paste0("log_", var)]] <- sign(temp) * log(abs(temp))
  } else {
    temp[temp < 0 | is.na(temp)] <- NA
    data[[paste0("log_", var)]] <- log(temp)
  }
}
```


Create a table of summary statistics

These results should match those found in "Table A1. Summary Statistics for All Variables".
```{r}

sum_data <- data %>% select(-case, -year)

summary_table <- sum_data %>%
  summarise(across(everything(), 
                   list(
                     Mean = ~mean(.x, na.rm = TRUE),
                     SD = ~sd(.x, na.rm = TRUE),
                     Min = ~min(.x, na.rm = TRUE),
                     Max = ~max(.x, na.rm = TRUE),
                     Median = ~median(.x, na.rm = TRUE),
                     N = ~sum(!is.na(.x))
                   ), 
                   .names = "{.col}_{.fn}"))

summary_table_long <- summary_table %>%
  pivot_longer(cols = everything(),
               names_to = "name",
               values_to = "Value") %>%
  mutate(
    Statistic = str_extract(name, "[^_]+$"),             
    Variable = str_replace(name, "_[^_]+$", "")          
  ) %>%
  select(Variable, Statistic, Value) %>%
  pivot_wider(names_from = Statistic, values_from = Value)

variable_order <- c("adj_ofda", "med_words", "org_med_words", "med_h", "affected_ofda",
                   "lag_trade_bal", "lag_adj_gdp", "lag_pop", "ch_words", "troops",
                   "log_adj_ofda", "log_med_words", "log_org_med_words", "log_med_h",
                   "log_affected_ofda", "log_lag_trade_bal", "log_lag_adj_gdp", 
                   "log_lag_pop", "log_ch_words", "log_troops")

summary_table_long <- summary_table_long %>%
  mutate(Variable = factor(Variable, levels = variable_order)) %>%
  arrange(Variable)


print(summary_table_long)


```


Create a correlation matrix for all main logged variables.

These results should match those found in "Table A2. Correlation Matrix". 
```{r}
cor_data <- data %>% 
  select(log_adj_ofda, log_med_words, log_med_h,
         log_ch_words, log_affected_ofda, log_lag_trade_bal,
         log_lag_adj_gdp, log_lag_pop)

cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")

# Print the correlation matrix
print(cor_matrix)

stargazer(cor_matrix, type = "text", title = "Correlation Matrix")
```

Fit a regression model to the data without country fixed effects. 

These results should match those found in "Table A5. Estimated Coefficients of Aid Without Country Fixed Effects"

```{r}
#center variables
vars_to_center <- c("log_med_h", "log_med_words", "log_ch_words", "log_affected_ofda", 
                    "log_lag_trade_bal", "log_lag_adj_gdp", "log_lag_pop")

data <- data %>%
  mutate(across(all_of(vars_to_center), ~ as.numeric(scale(. , center = TRUE, scale = FALSE)), .names = "centered_{.col}"))

data <- data %>% select(case, year, log_adj_ofda, log_med_h,
                        log_med_words, log_ch_words,
                        log_affected_ofda, log_lag_trade_bal, log_lag_adj_gdp,
                        log_lag_pop, centered_log_med_h, centered_log_med_words,
                        centered_log_ch_words, centered_log_affected_ofda,
                        centered_log_lag_trade_bal, centered_log_lag_adj_gdp,
                        centered_log_lag_pop)

#run the multiple imputation regressions
tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)



mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

#extract coefficients, se, p-values and compile into a table
process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column

  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))

print(formatted_table)


#calculate r-squared
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]

    # Ensure complete cases for all variables in the model
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)

    # Extract response variable
    y <- complete_data$log_adj_ofda

    # Construct design matrix
    X <- model.matrix(as.formula(formula), data = complete_data)

    # Extract coefficients
    beta <- coef(model)

    # Ensure column names in X match the coefficient names in beta
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]

    # Compute fitted values
    y_hat <- X %*% beta  

    # Compute R-squared
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })

  # Pooling R^2 using Rubin’s Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var

  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)

  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
   factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R² for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

# Convert results into a data frame for easy viewing
r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# Print results
print(r2_df)
```

Create another set of regressions, excluding the outlier cases of Syria, Iraq, and Afghanistan. 

The results should match those presented in "Table A6. Estimated Coefficients of Aid Excluding Iraq, Syria, and Afghanistan Cases"

```{r}

#filter outliers
data <- data[!grepl("Iraq|Afghan|Syria", data$case), ]

#run the multiple imputation regressions
tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)



mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

#extract coefficients, se, p-values and compile into a table
process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column

  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))

print(formatted_table)


#calculate r-squared
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]

    # Ensure complete cases for all variables in the model
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)

    # Extract response variable
    y <- complete_data$log_adj_ofda

    # Construct design matrix
    X <- model.matrix(as.formula(formula), data = complete_data)

    # Extract coefficients
    beta <- coef(model)

    # Ensure column names in X match the coefficient names in beta
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]

    # Compute fitted values
    y_hat <- X %*% beta  

    # Compute R-squared
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })

  # Pooling R^2 using Rubin’s Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var

  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)

  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
   factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R² for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

# Convert results into a data frame for easy viewing
r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# Print results
print(r2_df)
```

